library(dplyr)
library(tidyr)
library(caret)
library(pamr)

#set sample directory 
in_dir <- "/Users/cheng/OneDrive/Desktop/character/character/3_prepare_samples/output_sample_data/"
in_files <- dir(in_dir)
file_names <- file.path(in_dir,in_files)

# set output variables
totalACC <- list()
precALL<- list()
recallALL <- list()
mGenes <- list()
fGenes <- list()
fsens <- list()
mspec <- list()
negPred <- list()

decade_ACC <- list()
decade_PREC <- list()
decade_RECALL <- list()
decade_MGENES <- list()
decade_FGENES <- list()
decade_SENS <-list()
decade_SPEC <-list()
decade_NEGPRED <- list()

#The following code is awful, will refactor when I get the chance
#Produces models for all 15 samples, for each decade.

for(q in 1:length(file_names)){ 
  load(file_names[q])
  
  ##Create a model on 1.5 centuries of data
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  totalACC[[q]] <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(model = q)
  
  #Collect Precision & Recall Statistics & Sensitivity & Specifcity
  ##################################################################
  cmALL <- confusionMatrix(nscClass)
  precALL[[q]] <- as.data.frame(precision(cmALL$table))%>%
    mutate(model = q)
  recallALL[[q]] <- as.data.frame(recall(cmALL$table))%>%
    mutate(model = q)
  fsens[[q]] <- as.data.frame(sensitivity(cmALL$table))%>%
    mutate(model = q)
  mspec[[q]]<- as.data.frame(specificity(cmALL$table))%>%
    mutate(model = q)
  negPred[[q]] <- as.data.frame(negPredValue(cmALL$table))%>%
    mutate(model = q)
  
  #Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  mGenes[[q]] <- filter(genes, genes$M.score > 0) %>%
    mutate(model = q)
  fGenes[[q]] <- filter(genes, genes$F.score > 0)%>%
    mutate(model = q)
  
  
  ##Create a model on 1850s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1850) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1850 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1850) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1850 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1850) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1850 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1850) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1850 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1850) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1850 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1850) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1850 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1850) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1850 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1850) %>%
    mutate(sampleNUM = q)
  FGENES1850 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1850) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1860s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1860) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1860 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1860) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1860 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1860) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1860 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1860) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1860 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1860) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1860 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1860) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1860 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1860) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1860 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1860) %>%
    mutate(sampleNUM = q)
  FGENES1860 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1860) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1870s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1870) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1870 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1870) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1870 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1870) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1870 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1870) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1870 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1870) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1870 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1870) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1870 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1870) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1870 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1870) %>%
    mutate(sampleNUM = q)
  FGENES1870 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1870) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1880s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1880) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1880 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1880) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1880 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1880) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1880 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1880) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1880 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1880) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1880 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1880) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1880 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1880) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1880 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1880) %>%
    mutate(sampleNUM = q)
  FGENES1880 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1880) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1890s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1890) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1890 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1890) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1890 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1890) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1890 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1890) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1890 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1890) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1890 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1890) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1890 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1890) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1890 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1890) %>%
    mutate(sampleNUM = q)
  FGENES1890 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1890) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1900s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1900) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1900 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1900) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1900 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1900) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1900 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1900) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1900 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1900) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1900 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1900) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1900 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1900) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1900 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1900) %>%
    mutate(sampleNUM = q)
  FGENES1900 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1900) %>%
    mutate(sampleNUM = q)  
  
  ##Create a model on 1910s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1910) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1910 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1910) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1910 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1910) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1910 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1910) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1910 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1910) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1910 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1910) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1910 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1910) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1910 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1910) %>%
    mutate(sampleNUM = q)
  FGENES1910 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1910) %>%
    mutate(sampleNUM = q)  
  
  ##Create a model on 1920s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1920) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1920 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1920) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1920 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1920) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1920 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1920) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1920 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1920) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1920 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1920) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1920 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1920) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1920 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1920) %>%
    mutate(sampleNUM = q)
  FGENES1920 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1920) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1930s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1930) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1930 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1930) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1930 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1930) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1930 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1930) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1930 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1930) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1930 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1930) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1930 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1930) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1930 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1930) %>%
    mutate(sampleNUM = q)
  FGENES1930 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1930) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1940s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1940) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1940 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1940) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1940 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1940) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1940 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1940) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1940 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1940) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1940 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1940) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1940 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1940) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1940 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1940) %>%
    mutate(sampleNUM = q)
  FGENES1940 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1940) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1950s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1950) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1950 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1950) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1950 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1950) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1950 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1950) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1950 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1950) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1950 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1950) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1950 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1950) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1950 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1950) %>%
    mutate(sampleNUM = q)
  FGENES1950 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1950) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1960s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1960) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1960 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1960) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1960 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1960) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1960 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1960) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1960 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1960) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1960 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1960) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1960 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1960) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1960 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1960) %>%
    mutate(sampleNUM = q)
  FGENES1960 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1960) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1970s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1970) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1970 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1970) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1970 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1970) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1970 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1970) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1970 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1970) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1970 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1970) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1970 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1970) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1970 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1970) %>%
    mutate(sampleNUM = q)
  FGENES1970 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1970) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1980s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1980) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1980 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1980) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1980 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1980) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1980 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1980) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1980 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1980) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1980 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1980) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1980 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1980) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1980 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1980) %>%
    mutate(sampleNUM = q)
  FGENES1980 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1980) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 1990s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 1990) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC1990 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 1990) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC1990 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 1990) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL1990 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 1990) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS1990 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 1990) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC1990 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 1990) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED1990 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 1990) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES1990 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 1990) %>%
    mutate(sampleNUM = q)
  FGENES1990 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 1990) %>%
    mutate(sampleNUM = q) 
  
  ##Create a model on 2000s
  ##################################################################
  ##################################################################
  classingData <- test_allData %>%
    ungroup() %>%
    filter(pub_decade %in% 2000) %>%
    select(-one_of("file_name", "pub_decade",  "auth_gender")) %>%
    rename(Class = gender)
  
  justData <- classingData %>%
    select(-one_of("Class"))
  
  down_train <- downSample(x=justData, y=as.factor(classingData$Class))
  ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE)
  set.seed(1996)
  
  nscClass <- train(Class ~ ., data = down_train, 
                    method = "pam", 
                    preProcess = c("center","scale"), 
                    trControl = ctrl)
  
  
  #Collect Statistics
  ##################################################################
  ACC2000 <- as.data.frame(nscClass$results$Accuracy[1]) %>%
    mutate(decade = 2000) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  cm <- confusionMatrix(nscClass)
  
  PREC2000 <- as.data.frame(precision(cm$table)) %>%
    mutate(decade = 2000) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  RECALL2000 <- as.data.frame(recall(cm$table)) %>%
    mutate(decade = 2000) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SENS2000 <-as.data.frame(sensitivity(cm$table)) %>%
    mutate(decade = 2000) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  SPEC2000 <-as.data.frame(specificity(cm$table)) %>%
    mutate(decade = 2000) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  NEGPRED2000 <- as.data.frame(negPredValue(cm$table)) %>%
    mutate(decade = 2000) %>%
    rename(accuracy = 1) %>% 
    mutate(model = q)
  
  
  ##Collect Genes
  ##################################################################
  helix <- list(x=t(justData), y=factor(down_train$Class), geneid=colnames(justData))
  
  genes <- data.frame(pamr.listgenes(nscClass$finalModel, 
                                     helix, 
                                     nscClass$finalModel$threshold, 
                                     pamr.cv(nscClass$finalModel,helix)),
                      stringsAsFactors = F)
  
  MGENES2000 <- filter(genes, genes$M.score > 0) %>%
    mutate(decade = 2000) %>%
    mutate(sampleNUM = q)
  FGENES2000 <- filter(genes, genes$F.score > 0) %>%
    mutate(decade = 2000) %>%
    mutate(sampleNUM = q) 
  
  ###Send to Dataframes
  #################################################################
  #################################################################
  
  decade_ACC[[q]] <- bind_rows(ACC1850, ACC1860, ACC1870, ACC1880, ACC1890, ACC1900,
                               ACC1910, ACC1920, ACC1930, ACC1940, ACC1950, ACC1960,
                               ACC1970, ACC1980, ACC1990, ACC2000)
  decade_PREC[[q]] <- bind_rows(PREC1850, PREC1860, PREC1870, PREC1880, PREC1890, PREC1900,
                                PREC1910, PREC1920, PREC1930, PREC1940, PREC1950, PREC1960,
                                PREC1970, PREC1980, PREC1990, PREC2000)
  decade_RECALL[[q]] <- bind_rows(RECALL1850, RECALL1860, RECALL1870, RECALL1880, RECALL1890, RECALL1900,
                                  RECALL1910, RECALL1920, RECALL1930, RECALL1940, RECALL1950, RECALL1960,
                                  RECALL1970, RECALL1980, RECALL1990, RECALL2000)
  decade_MGENES[[q]] <- bind_rows(MGENES1850, MGENES1860, MGENES1870, MGENES1880, MGENES1890, MGENES1900,
                                  MGENES1910, MGENES1920, MGENES1930, MGENES1940, MGENES1950, MGENES1960,
                                  MGENES1970, MGENES1980, MGENES1990, MGENES2000)
  decade_FGENES[[q]] <- bind_rows(FGENES1850, FGENES1860, FGENES1870, FGENES1880, FGENES1890, FGENES1900,
                                  FGENES1910, FGENES1920, FGENES1930, FGENES1940, FGENES1950, FGENES1960,
                                  FGENES1970, FGENES1980, FGENES1990, FGENES2000)
  decade_SENS[[q]] <- bind_rows(SENS1850, SENS1860, SENS1870, SENS1880, SENS1890, SENS1900,
                                SENS1910, SENS1920, SENS1930, SENS1940, SENS1950, SENS1960,
                                SENS1970, SENS1980, SENS1990, SENS2000)
  decade_SPEC[[q]] <- bind_rows(SPEC1850, SPEC1860, SPEC1870, SPEC1880, SPEC1890, SPEC1900,
                                SPEC1910, SPEC1920, SPEC1930, SPEC1940, SPEC1950, SPEC1960,
                                SPEC1970, SPEC1980, SPEC1990, SPEC2000)
  decade_NEGPRED[[q]] <- bind_rows(NEGPRED1850, NEGPRED1860, NEGPRED1870, NEGPRED1880, NEGPRED1890, NEGPRED1900,
                                   NEGPRED1910, NEGPRED1920, NEGPRED1930, NEGPRED1940, NEGPRED1950, NEGPRED1960,
                                   NEGPRED1970, NEGPRED1980, NEGPRED1990, NEGPRED2000)
}


##Bind Rows and Collect Data
##################################################################
##################################################################
allACC <- totalACC %>%
  bind_rows()
allPREC <- precALL %>%
  bind_rows()
allRECALL <- recallALL %>%
  bind_rows()
allSENS <- fsens %>%
  bind_rows()
allSPEC <- mspec %>%
  bind_rows()
allNEGPRED <- negPred %>%
  bind_rows
mgenesALL <- mGenes %>% 
  bind_rows()
fgenesALL <- fGenes %>%
  bind_rows()

decadeACC <- decadeACC %>%
  bind_rows() %>%
  arrange(decade)

#save figure 4 data#
write_csv(decadeACC, "/Users/cheng/OneDrive/Desktop/character/character/4_build_models")

decadePREC <- decade_PREC %>%
  bind_rows()
decadeRECALL <- decade_RECALL %>%
  bind_rows()
decadeSENS <- decade_SENS %>%
  bind_rows()
decadeSPEC <- decade_SPEC %>%
  bind_rows()
decadeNEGPRED <- decade_NEGPRED %>%
  bind_rows()
mgenesDECADE <- decade_MGENES %>%
  bind_rows()
fgenesDECADE <- decade_FGENES %>%
  bind_rows()


###Gene Layout
############################################################################
############################################################################

#Look for most common genes in 15 models
########################################
commonMGENES <- mgenesDECADE %>%
  group_by(id,decade) %>%
  summarise(frequency = n()) %>% 
  filter(frequency >= 15)

commonFGENES <- fgenesDECADE%>%
  group_by(id,decade) %>%
  summarise(frequency = n()) %>% 
  filter(frequency >= 15)

write.csv(commonMGENES, file = "/output_gene_data/mGenes.csv")
write.csv(commonFGENES, file = "/output_gene_data/fGenes.csv")
